This is the code for the EDA deliverable for MSDS Project 1.
Get the data
Active library
1. How many breweries are present in each state?
##
## AK AL AR AZ CA CO CT DC DE FL GA HI IA ID IL IN KS KY
## 7 3 2 11 39 47 8 1 2 15 7 4 5 5 18 22 3 4
## LA MA MD ME MI MN MO MS MT NC ND NE NH NJ NM NV NY OH
## 5 23 7 9 32 12 9 2 9 19 1 5 3 3 4 2 16 15
## OK OR PA RI SC SD TN TX UT VA VT WA WI WV WY
## 6 29 25 5 4 1 3 28 4 16 10 23 20 1 4
Brew_Num <- BreweriesDF %>%
group_by(State) %>%
dplyr::summarize(count=n())
# Plot
Brew_Num$State = factor(Brew_Num$State, level = Brew_Num$State[order(-Brew_Num$count)])
Brew_Num %>%
ggplot(aes(x=State, y=count, fill=State))+geom_bar(stat = "identity", width = 0.8) +
ylab("Brewery")+ggtitle("Brewery vs State")+
geom_text(aes(label=count), hjust=0.4, vjust=-0.3, size=3) +
theme(axis.text.x = element_text(size=7, angle=90))2. Merge beer data with the breweries data.
Print the first 6 observations and the last six observations to check the merged file.
(RMD only, this does not need to be included in the presentation or the deck.)
mergeDF <- merge(BeersDF, BreweriesDF, by.x = "Brewery_id", by.y = "Brew_ID")
colnames(mergeDF)[2] = "Beer_name"
colnames(mergeDF)[8] = "Brewery_name"
print("The first 6 observations of the merged file:")## [1] "The first 6 observations of the merged file:"
## Brewery_id Beer_name Beer_ID ABV IBU
## 1 1 Get Together 2692 0.045 50
## 2 1 Maggie's Leap 2691 0.049 26
## 3 1 Wall's End 2690 0.048 19
## 4 1 Pumpion 2689 0.060 38
## 5 1 Stronghold 2688 0.060 25
## 6 1 Parapet ESB 2687 0.056 47
## Style Ounces Brewery_name
## 1 American IPA 16 NorthGate Brewing
## 2 Milk / Sweet Stout 16 NorthGate Brewing
## 3 English Brown Ale 16 NorthGate Brewing
## 4 Pumpkin Ale 16 NorthGate Brewing
## 5 American Porter 16 NorthGate Brewing
## 6 Extra Special / Strong Bitter (ESB) 16 NorthGate Brewing
## City State
## 1 Minneapolis MN
## 2 Minneapolis MN
## 3 Minneapolis MN
## 4 Minneapolis MN
## 5 Minneapolis MN
## 6 Minneapolis MN
## [1] "The last 6 observations of the merged file:"
## Brewery_id Beer_name Beer_ID ABV IBU
## 2405 556 Pilsner Ukiah 98 0.055 NA
## 2406 557 Heinnieweisse Weissebier 52 0.049 NA
## 2407 557 Snapperhead IPA 51 0.068 NA
## 2408 557 Moo Thunder Stout 50 0.049 NA
## 2409 557 Porkslap Pale Ale 49 0.043 NA
## 2410 558 Urban Wilderness Pale Ale 30 0.049 NA
## Style Ounces Brewery_name
## 2405 German Pilsener 12 Ukiah Brewing Company
## 2406 Hefeweizen 12 Butternuts Beer and Ale
## 2407 American IPA 12 Butternuts Beer and Ale
## 2408 Milk / Sweet Stout 12 Butternuts Beer and Ale
## 2409 American Pale Ale (APA) 12 Butternuts Beer and Ale
## 2410 English Pale Ale 12 Sleeping Lady Brewing Company
## City State
## 2405 Ukiah CA
## 2406 Garrattsville NY
## 2407 Garrattsville NY
## 2408 Garrattsville NY
## 2409 Garrattsville NY
## 2410 Anchorage AK
3. Address the missing values in each column.
Option 1: Simply replacing missing values in IBU/ABV with state median results in significant drop on correlation, from 67% to 49% (Refer to #7)
IBUNADF <- mergeDF %>% filter(is.na(IBU))
ABVNADF <- mergeDF %>% filter(is.na(ABV))
BOTHNADF <- mergeDF %>% filter(is.na(ABV) & is.na(IBU))
# Check which state give the most NA's
summary(IBUNADF$State)## AK AL AR AZ CA CO CT DC DE FL GA HI IA ID IL IN KS KY
## 8 1 4 23 48 119 21 4 1 21 9 9 5 13 52 48 4 7
## LA MA MD ME MI MN MO MS MT NC ND NE NH NJ NM NV NY OH
## 9 31 11 20 124 9 13 0 17 29 0 16 6 0 8 3 28 17
## OK OR PA RI SC SD TN TX UT VA VT WA WI WV WY
## 8 38 53 7 9 7 1 41 15 5 10 25 45 0 3
## AK AL AR AZ CA CO CT DC DE FL GA HI IA ID IL IN KS KY
## 25 10 5 47 183 265 27 8 2 58 16 27 30 30 91 139 23 21
## LA MA MD ME MI MN MO MS MT NC ND NE NH NJ NM NV NY OH
## 19 82 21 27 162 55 42 11 40 59 3 25 8 8 14 11 74 49
## OK OR PA RI SC SD TN TX UT VA VT WA WI WV WY
## 19 125 100 27 14 7 6 130 26 40 27 68 87 2 15
## AK AL AR AZ CA CO CT DC DE FL GA HI IA ID IL IN KS KY
## 0 0 0 3 1 15 0 0 1 2 0 0 0 0 0 2 0 1
## LA MA MD ME MI MN MO MS MT NC ND NE NH NJ NM NV NY OH
## 0 0 0 0 11 0 3 0 1 4 0 4 0 0 1 1 1 0
## OK OR PA RI SC SD TN TX UT VA VT WA WI WV WY
## 0 0 3 0 0 0 0 6 0 0 0 0 2 0 0
# NADF <- data.frame(State=State, ALL=summary(mergeDF$State), IBUNA=summary(IBUNADF$State), ABVNA=summary(ABVNADF$State))
# NADF$IBUNAperc <- NADF$IBUNA/NADF$ALL
# NADF$ABVNAperc <- NADF$ABVNA/NADF$ALL
# Calculate IBU and ABV median value of each state
IBUmedianState <- mergeDF %>% filter(!is.na(IBU)) %>% group_by(State) %>% dplyr::summarise(IBUmedian=median(IBU))
ABVmedianState <- mergeDF %>% filter(!is.na(ABV)) %>% group_by(State) %>% dplyr::summarise(ABVmedian=median(ABV))
# Replace IBU NA's with state median value
mergeDF_no_IBU_NA <- left_join(mergeDF, IBUmedianState, by = "State")
mergeDF_no_IBU_NA$IBU[is.na(mergeDF_no_IBU_NA$IBU)] <- as.character(mergeDF_no_IBU_NA$IBUmedian[is.na(mergeDF_no_IBU_NA$IBU)])
aggr(mergeDF_no_IBU_NA, prop=FALSE, numbers=TRUE) # Still 7 observations IBU NA, because there is no any IBU value available for state # Replace 7 IBU NA's in SD with grand median value 35.00
mergeDF_no_IBU_NA$IBU = str_replace_na(mergeDF_no_IBU_NA$IBU, replacement = "35")
mergeDF_no_IBU_NA$IBUmedian = str_replace_na(mergeDF_no_IBU_NA$IBUmedian, replacement = "35")
# Replace ABV NA's with state median value
mergeDF_noNA <- left_join(mergeDF_no_IBU_NA, ABVmedianState, by = "State")
mergeDF_noNA$ABV[is.na(mergeDF_noNA$ABV)] <- as.character(mergeDF_noNA$ABVmedian[is.na(mergeDF_noNA$ABV)])
aggr(mergeDF_noNA, prop=FALSE, numbers=TRUE)mergeDF_noNA$IBU <- as.numeric(mergeDF_noNA$IBU)
mergeDF_noNA$ABV <- as.numeric(mergeDF_noNA$ABV)
# Plot
IBU.plotly <- mergeDF_noNA %>% ggplot(aes(x=IBU))+geom_bar()+ggtitle("IBU Distribution") +
theme(axis.text.x = element_text(size = 6, angle = 90))
ggplotly(IBU.plotly)ABV.plotly <- mergeDF_noNA %>% ggplot(aes(x=ABV))+geom_bar()+ggtitle("ABV Distribution") +
theme(axis.text.x = element_text(size = 6, angle = 90))
ggplotly(ABV.plotly)## Brewery_id Beer_name Beer_ID
## Min. : 1.0 Nonstop Hef Hop : 12 Min. : 1.0
## 1st Qu.: 94.0 Dale's Pale Ale : 6 1st Qu.: 808.2
## Median :206.0 Oktoberfest : 6 Median :1453.5
## Mean :232.7 Longboard Island Lager: 4 Mean :1431.1
## 3rd Qu.:367.0 1327 Pod's ESB : 3 3rd Qu.:2075.8
## Max. :558.0 Boston Lager : 3 Max. :2692.0
## (Other) :2376
## ABV IBU Style
## Min. :0.00100 Min. : 4.00 American IPA : 424
## 1st Qu.:0.05000 1st Qu.: 26.00 American Pale Ale (APA) : 245
## Median :0.05700 Median : 35.00 American Amber / Red Ale : 133
## Mean :0.05973 Mean : 39.81 American Blonde Ale : 108
## 3rd Qu.:0.06700 3rd Qu.: 47.00 American Double / Imperial IPA: 105
## Max. :0.12800 Max. :138.00 American Pale Wheat Ale : 97
## (Other) :1298
## Ounces Brewery_name City
## Min. : 8.40 Brewery Vivant : 62 Grand Rapids: 66
## 1st Qu.:12.00 Oskar Blues Brewery : 46 Portland : 64
## Median :12.00 Sun King Brewing Company : 38 Chicago : 55
## Mean :13.59 Cigar City Brewing Company: 25 Indianapolis: 43
## 3rd Qu.:16.00 Sixpoint Craft Ales : 24 San Diego : 42
## Max. :32.00 Hopworks Urban Brewery : 23 Boulder : 41
## (Other) :2192 (Other) :2099
## State IBUmedian ABVmedian
## CO : 265 Length:2410 Min. :0.04000
## CA : 183 Class :character 1st Qu.:0.05500
## MI : 162 Mode :character Median :0.05700
## IN : 139 Mean :0.05677
## TX : 130 3rd Qu.:0.05800
## OR : 125 Max. :0.06250
## (Other):1406
Option 2: Alternative option is to use Simple Linear Regression model to predict IBU missing values, to keep the correlation between IBU/ABV no change.
# Calculate ABV median value of each state
ABVmedianState <- mergeDF %>% filter(!is.na(ABV)) %>% group_by(State) %>%
dplyr::summarise(ABVmedian=median(ABV))
# Replace 62 NA's in ABV with state level median
mergeDF_no_ABV_NA <- left_join(mergeDF, ABVmedianState, by = "State")
mergeDF_no_ABV_NA$ABV[is.na(mergeDF_no_ABV_NA$ABV)] <-
mergeDF_no_ABV_NA$ABVmedian[is.na(mergeDF_no_ABV_NA$ABV)]
aggr(mergeDF_no_ABV_NA, prop=FALSE, numbers=TRUE)# Build simple regression model with 1405 pairs of IBU/ABV to get the correlation coef.
mergeDF_tidy <- mergeDF %>% filter(!is.na(IBU) & !is.na(ABV))
mod <- lm(mergeDF_tidy$IBU ~ mergeDF_tidy$ABV)
summary(mod)##
## Call:
## lm(formula = mergeDF_tidy$IBU ~ mergeDF_tidy$ABV)
##
## Residuals:
## Min 1Q Median 3Q Max
## -78.849 -11.977 -0.721 13.997 93.458
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -34.099 2.326 -14.66 <2e-16 ***
## mergeDF_tidy$ABV 1282.037 37.860 33.86 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 19.26 on 1403 degrees of freedom
## Multiple R-squared: 0.4497, Adjusted R-squared: 0.4493
## F-statistic: 1147 on 1 and 1403 DF, p-value: < 2.2e-16
Intercept <- mod$coefficients[1]
Slope <- mod$coefficients[2]
# Add new column predicted_IBU base on correlation coef.
# Change negative predicted_IBU to 0, IBU is measured on a scale from 0 to infinite
mergeDF_no_ABV_NA$predict_IBU <- pmax(Intercept + Slope*mergeDF_no_ABV_NA$ABV, 0)
summary(mergeDF_no_ABV_NA$IBU)## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 4.00 21.00 35.00 42.71 64.00 138.00 1005
# Replace 1005 NA's in IBU with predicted_IBU
mergeDF_no_ABV_NA$IBU[is.na(mergeDF_no_ABV_NA$IBU)] <-
mergeDF_no_ABV_NA$predict_IBU[is.na(mergeDF_no_ABV_NA$IBU)]
summary(mergeDF_no_ABV_NA$IBU)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 26.00 36.41 42.49 55.64 138.00
4. Compute the median alcohol content(ABV) and international bitterness unit (IBU) for each state. Plot a bar chart to compare.
# plot IBU median distribution using IBUmedianState data frame
## Calculate IBU and ABV median value of each state
IBUmedianState <- tidyDF %>% filter(!is.na(IBU)) %>% group_by(State) %>% dplyr::summarise(IBUmedian=median(IBU))
ABVmedianState <- tidyDF %>% filter(!is.na(ABV)) %>% group_by(State) %>% dplyr::summarise(ABVmedian=median(ABV))
summary(IBUmedianState)## State IBUmedian
## AK : 1 Min. :22.00
## AL : 1 1st Qu.:35.00
## AR : 1 Median :37.00
## AZ : 1 Mean :37.65
## CA : 1 3rd Qu.:40.63
## CO : 1 Max. :57.50
## (Other):45
## Plot
IBUmedianState$State <- factor(IBUmedianState$State, level = IBUmedianState$State[order(IBUmedianState$IBUmedian)])
IBUmedianState %>% ggplot(aes(x=State, y=IBUmedian, fill=State)) + geom_bar(stat = "identity") + ggtitle("IBU Median By State")+
geom_text(aes(label=sprintf("%0.2f", round(IBUmedian,digits = 2))), hjust=-0.5) +
theme(axis.text.y = element_text(size = 6)) +
coord_flip()# plot ABV median distribution using ABVmedianState data frame
ABVmedianState$State <- factor(ABVmedianState$State, level = ABVmedianState$State[order(ABVmedianState$ABVmedian)])
ABVmedianState %>% ggplot(aes(x=State, y=ABVmedian, fill=State)) + geom_bar(stat = "identity") + ggtitle("ABV Median By State")+
geom_text(aes(label=sprintf("%0.2f", round(ABVmedian,digits = 2))), hjust=-0.5) +
theme(axis.text.y = element_text(size = 6)) +
coord_flip()## State ABVmedian
## UT : 1 Min. :0.04000
## NJ : 1 1st Qu.:0.05500
## KS : 1 Median :0.05600
## ND : 1 Mean :0.05585
## WY : 1 3rd Qu.:0.05800
## ME : 1 Max. :0.06250
## (Other):45
# plot scatter with 51 pairs of median values of ABV and IBU
ABV_IBU_mergeDF <- merge(ABVmedianState, IBUmedianState, by = "State" )
ABV_IBU_mergeDF$State <- factor(ABV_IBU_mergeDF$State, droplevels(ABV_IBU_mergeDF$State))
ABV_IBU_mergeDF %>% ggplot(aes(x=IBUmedian, y=ABVmedian)) + geom_point()5. Which state has the maximum alcoholic (ABV) beer? Which state has the most bitter (IBU) beer?
# Calculate IBU and ABV max value of each state
IBUmaxState <- tidyDF %>% filter(!is.na(IBU)) %>% group_by(State) %>% dplyr::summarise(IBUmax=max(IBU))
ABVmaxState <- tidyDF %>% filter(!is.na(ABV)) %>% group_by(State) %>% dplyr::summarise(ABVmax=max(ABV))
# plot IBU max distribution using IBUmedianState data frame
IBUmaxState$State <- factor(IBUmaxState$State, level = IBUmaxState$State[order(IBUmaxState$IBUmax)])
IBUmaxState %>% ggplot(aes(x=State, y=IBUmax, fill=State)) + geom_bar(stat = "identity") + ggtitle("IBU max by State")+
geom_text(aes(label=sprintf("%0.2f", round(IBUmax,digits = 2))), hjust=+0.5) +
theme(axis.text.y = element_text(size = 6)) +
coord_flip()# plot ABV max distribution using ABVmaxState data frame
ABVmaxState$State <- factor(ABVmaxState$State, level = ABVmaxState$State[order(ABVmaxState$ABVmax)])
ABVmaxState %>% ggplot(aes(x=State, y=ABVmax, fill=State)) + geom_bar(stat = "identity") + ggtitle("ABV max in States")+
geom_text(aes(label=sprintf("%0.2f", round(ABVmax,digits = 2))), hjust=+0.5) +
theme(axis.text.y = element_text(size = 6)) +
coord_flip()7. Is there an apparent relationship between the bitterness of the beer and its alcoholic content? Draw a scatter plot. Make your best judgment of a relationship and EXPLAIN your answer.
# Scatter plot and Pearson's correlation with original data
mergeDF %>% filter(!is.na(IBU) & !is.na(ABV)) %>%
ggplot(aes(x=ABV, y=IBU)) +
geom_point(aes(color=State), size=1, position="jitter") +
geom_smooth(method = lm) +
theme_wsj() +
xlab("ABV") + ylab("IBU") +
ggtitle("Relationship between IBU and ABV")## Warning: plotly.js does not (yet) support horizontal legend items
## You can track progress here:
## https://github.com/plotly/plotly.js/issues/53
mergeDF_tidy <- mergeDF %>% filter(!is.na(IBU) & !is.na(ABV))
cor.test(mergeDF_tidy$IBU, mergeDF_tidy$ABV)##
## Pearson's product-moment correlation
##
## data: mergeDF_tidy$IBU and mergeDF_tidy$ABV
## t = 33.863, df = 1403, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.6407982 0.6984238
## sample estimates:
## cor
## 0.6706215
##
## Call:
## lm(formula = mergeDF_tidy$IBU ~ mergeDF_tidy$ABV)
##
## Residuals:
## Min 1Q Median 3Q Max
## -78.849 -11.977 -0.721 13.997 93.458
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -34.099 2.326 -14.66 <2e-16 ***
## mergeDF_tidy$ABV 1282.037 37.860 33.86 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 19.26 on 1403 degrees of freedom
## Multiple R-squared: 0.4497, Adjusted R-squared: 0.4493
## F-statistic: 1147 on 1 and 1403 DF, p-value: < 2.2e-16
# Scatter plot and Pearson's correlation with tidy data (no missing value)
tidyDF %>% ggplot(aes(x=ABV, y=IBU)) +
geom_point(aes(color=State), size=1, position="jitter") +
geom_smooth(method = lm) +
ggtitle("Relationship between IBU and ABV")##
## Pearson's product-moment correlation
##
## data: tidyDF$IBU and tidyDF$ABV
## t = 57.005, df = 2408, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.7403535 0.7743731
## sample estimates:
## cor
## 0.757878
##
## Call:
## lm(formula = tidyDF$IBU ~ tidyDF$ABV)
##
## Residuals:
## Min 1Q Median 3Q Max
## -78.718 -4.712 -0.035 2.762 93.442
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -33.818 1.372 -24.65 <2e-16 ***
## tidyDF$ABV 1277.567 22.411 57.01 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.72 on 2408 degrees of freedom
## Multiple R-squared: 0.5744, Adjusted R-squared: 0.5742
## F-statistic: 3250 on 1 and 2408 DF, p-value: < 2.2e-16
#Correlations
Corl <- inspect_cor(mergeDF)
show_plot (Corl) + ggtitle("Correlation Between All Attributes")# Sctter Plot
mergeDF %>% ggplot(aes(x=IBU, y=ABV)) + geom_point(aes(color=State), size=1, position="jitter") + geom_smooth()## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 1005 rows containing non-finite values (stat_smooth).
## Warning: Removed 1005 rows containing missing values (geom_point).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
8. Budweiser would also like to investigate the difference with respect to IBU and ABV between IPAs (India Pale Ales) and other types of Ale (any beer with “Ale” in its name other than IPA). You decide to use KNN clustering to investigate this relationship. Provide statistical evidence one way or the other. You can of course assume your audience is comfortable with percentages … KNN is very easy to understand.
##
## Ale IPA Other
## 1007 571 832
# Check the Ale: 2nd time with regex(): # Ale:1007, IPA:571, Other:832
tidyDF.knn <- tidyDF %>% filter(IPA_Ale == "IPA" | IPA_Ale == "Ale")
##
set.seed(1234)
splitPerc = .75
iterations = 50
numks = 30
masterAcc = matrix(nrow = iterations, ncol = numks)
for(j in 1:iterations)
{
accs = data.frame(accuracy = numeric(30), k = numeric(30))
trainIndices = sample(1:dim(tidyDF.knn)[1],round(splitPerc * dim(tidyDF.knn)[1]))
train = tidyDF.knn[trainIndices,]
test = tidyDF.knn[-trainIndices,]
for(i in 1:numks)
{
classifications = knn(train[,c(4,5)],test[,c(4,5)],train$IPA_Ale, prob = TRUE, k = i)
table(classifications,test$IPA_Ale)
CM = confusionMatrix(table(classifications,test$IPA_Ale))
masterAcc[j,i] = CM$overall[1]
}
}
MeanAcc = colMeans(masterAcc)
maxMeanAcc = max(MeanAcc) # k=?
plot(seq(1,numks,1),MeanAcc, type = "l")classifications5 = knn(train[,c(4,5)],test[,c(4,5)],train$IPA_Ale, prob = TRUE, k = 5)
table(classifications5,test$IPA_Ale)##
## classifications5 Ale IPA
## Ale 199 41
## IPA 42 112
## Confusion Matrix and Statistics
##
##
## classifications5 Ale IPA
## Ale 199 41
## IPA 42 112
##
## Accuracy : 0.7893
## 95% CI : (0.7457, 0.8286)
## No Information Rate : 0.6117
## P-Value [Acc > NIR] : 3.385e-14
##
## Kappa : 0.5571
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.8257
## Specificity : 0.7320
## Pos Pred Value : 0.8292
## Neg Pred Value : 0.7273
## Prevalence : 0.6117
## Detection Rate : 0.5051
## Detection Prevalence : 0.6091
## Balanced Accuracy : 0.7789
##
## 'Positive' Class : Ale
##
set.seed(1234)
splitPerc = .80
iterations = 50
numks = 30
masterAcc = matrix(nrow = iterations, ncol = numks)
for(j in 1:iterations)
{
accs = data.frame(accuracy = numeric(30), k = numeric(30))
trainIndices = sample(1:dim(tidyDF.knn)[1],round(splitPerc * dim(tidyDF.knn)[1]))
train = tidyDF.knn[trainIndices,]
test = tidyDF.knn[-trainIndices,]
for(i in 1:numks)
{
classifications = knn(train[,c(4,5)],test[,c(4,5)],train$IPA_Ale, prob = TRUE, k = i)
table(classifications,test$IPA_Ale)
CM = confusionMatrix(table(classifications,test$IPA_Ale))
masterAcc[j,i] = CM$overall[1]
}
}
MeanAcc = colMeans(masterAcc)
maxMeanAcc = max(MeanAcc) # k=?
plot(seq(1,numks,1),MeanAcc, type = "l")classifications5 = knn(train[,c(4,5)],test[,c(4,5)],train$IPA_Ale, prob = TRUE, k = 6)
table(classifications5,test$IPA_Ale)##
## classifications5 Ale IPA
## Ale 182 35
## IPA 25 74
## Confusion Matrix and Statistics
##
##
## classifications5 Ale IPA
## Ale 182 35
## IPA 25 74
##
## Accuracy : 0.8101
## 95% CI : (0.7625, 0.8519)
## No Information Rate : 0.6551
## P-Value [Acc > NIR] : 8.741e-10
##
## Kappa : 0.5705
##
## Mcnemar's Test P-Value : 0.2453
##
## Sensitivity : 0.8792
## Specificity : 0.6789
## Pos Pred Value : 0.8387
## Neg Pred Value : 0.7475
## Prevalence : 0.6551
## Detection Rate : 0.5759
## Detection Prevalence : 0.6867
## Balanced Accuracy : 0.7791
##
## 'Positive' Class : Ale
##
Check Budweiser profile
Neither Budweiser nor Anheuser-Busch can be found in list of Brewery_name
We cannot build Budweiser profile with the available data set
Benchmarking with competitors will not be realistic
tidyDF$BudweiserOrOther <- ifelse(str_detect(tidyDF$Brewery_name,regex("Budweiser", ignore_case = TRUE)),"Budweiser","Other")
table(tidyDF$BudweiserOrOther)##
## Other
## 2410
## [1] Brewery_id Beer_name Beer_ID ABV
## [5] IBU Style Ounces Brewery_name
## [9] City State ABVmedian predict_IBU
## [13] IPA_Ale BudweiserOrOther
## <0 rows> (or 0-length row.names)
9. Knock their socks off! Find one other useful inference from the data that you feel Budweiser may be able to find value in. You must convince them why it is important and back up your conviction with appropriate statistical evidence.
lookup = data.frame(abb=state.abb, state= state.name )
#View(lookup)
# add new column to tidyDF with lower case state codes to user to merge with lookup DF
tidyDF$abb <- tidyDF$State
#View(tidyDF)
# create a new data set with BrewriesDF and lookup data
BrewriesDF1 = merge(tidyDF,lookup,"abb")
#View(BrewriesDF1)
# count # of sate in Acu2 data frame
BrewriesMPD= count(BrewriesDF1,"state")
#View(BrewriesMPD)
# Change the name of the count column to Brewreies Count
colnames(BrewriesMPD)[2] = "BrewriesCnt"
#change the names of the states to lower case.
BrewriesMPD$region <- tolower(BrewriesMPD$state)
# get the longi/lat of the states in order to draw the map
states <- map_data("state")
#View(states)
#merge the states long/lat data with AcuMap data
map.df <- left_join(states,BrewriesMPD, by = "region")
#View(map.df)
# Plot the data on map
ggplot(map.df, aes(long, lat, group = group))+
geom_polygon(aes(fill = BrewriesCnt ), stat= "identity", color = "white")+
#geom_text(aes(label=state), hjust=0.5, vjust=0.5, size=3)
#geom_text(aes(label = BrewriesMPD$count, )) +
scale_fill_gradientn (colours= (heat.colors(3)),na.value="grey90")+
ggtitle( "Breweries by State", subtitle = "Geopositional Analysis")# Analyze External Datasource AlcoholDF
AlcoholDF$region <- tolower(AlcoholDF$State)
# AlcoholDF$NumTot <- as.numeric(AlcoholDF$Total)
AlcoholDF$NumTot <- as.numeric(AlcoholDF$GreaterThan12Est)
AlcoholDF$chaTot<- as.character(AlcoholDF$NumTot)
AlcoholMap.df <- left_join(states,AlcoholDF, by = "region")
View(AlcoholMap.df)
ggplot(AlcoholMap.df, aes(long, lat, group = group)) +
geom_polygon(aes(fill = GreaterThan12Est), stat= "identity", color = "white") +
#geom_text(aes(label=state), hjust=0.5, vjust=0.5, size=3)
#geom_text(aes(label = BrewriesMPD$count, )) +
scale_fill_gradientn (colours= (heat.colors(3)),na.value="grey90")+
ggtitle( "Alcohol Abuse by State", subtitle = "Geopositional Analysis")
6. Comment on the summary statistics and distribution of the ABV variable.